In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly
import plotly.graph_objs as go
import seaborn as sns
import plotly.express as px
boxify_dataset = pd.read_csv(r"E:\DATA ANALYSIS\UpGrad\Capstone_Project\BOXIFY_UPGRAD_CAPSTONE PROJECT\Boxify Dataset.csv")
boxify_dataset.head()
Out[1]:
| Order | File_Type | SKU_number | SoldFlag | SoldCount | MarketingType | ReleaseNumber | New_Release_Flag | StrengthFactor | PriceReg | ReleaseYear | ItemCount | LowUserPrice | LowNetPrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2 | Historical | 1737127 | 0.0 | 0.0 | D | 15 | 1 | 682743.0 | 44.99 | 2015 | 8 | 28.97 | 31.84 |
| 1 | 3 | Historical | 3255963 | 0.0 | 0.0 | D | 7 | 1 | 1016014.0 | 24.81 | 2005 | 39 | 0.00 | 15.54 |
| 2 | 4 | Historical | 612701 | 0.0 | 0.0 | D | 0 | 0 | 340464.0 | 46.00 | 2013 | 34 | 30.19 | 27.97 |
| 3 | 6 | Historical | 115883 | 1.0 | 1.0 | D | 4 | 1 | 334011.0 | 100.00 | 2006 | 20 | 133.93 | 83.15 |
| 4 | 7 | Historical | 863939 | 1.0 | 1.0 | D | 2 | 1 | 1287938.0 | 121.95 | 2010 | 28 | 4.00 | 23.99 |
b. Summary of DataFrames¶
In [50]:
boxify_dataset.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 198917 entries, 0 to 198916 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Order 198917 non-null int64 1 File_Type 198917 non-null object 2 SKU_number 198917 non-null int64 3 SoldFlag 75996 non-null float64 4 SoldCount 75996 non-null float64 5 MarketingType 198917 non-null object 6 ReleaseNumber 198917 non-null int64 7 New_Release_Flag 198917 non-null int64 8 StrengthFactor 198917 non-null float64 9 PriceReg 198917 non-null float64 10 ReleaseYear 198917 non-null int64 11 ItemCount 198917 non-null int64 12 LowUserPrice 198917 non-null float64 13 LowNetPrice 198917 non-null float64 dtypes: float64(6), int64(6), object(2) memory usage: 21.2+ MB
c. Statistical Summary of Dataset¶
In [51]:
boxify_dataset.describe()
Out[51]:
| Order | SKU_number | SoldFlag | SoldCount | ReleaseNumber | New_Release_Flag | StrengthFactor | PriceReg | ReleaseYear | ItemCount | LowUserPrice | LowNetPrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 198917.000000 | 1.989170e+05 | 75996.000000 | 75996.000000 | 198917.000000 | 198917.000000 | 1.989170e+05 | 198917.000000 | 198917.000000 | 198917.000000 | 198917.000000 | 198917.000000 |
| mean | 106483.543242 | 8.613626e+05 | 0.171009 | 0.322306 | 3.412202 | 0.642248 | 1.117115e+06 | 90.895243 | 2006.016414 | 41.426283 | 30.982487 | 46.832053 |
| std | 60136.716784 | 8.699794e+05 | 0.376519 | 1.168615 | 3.864243 | 0.479340 | 1.522090e+06 | 86.736367 | 9.158331 | 37.541215 | 69.066155 | 128.513236 |
| min | 2.000000 | 5.000100e+04 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 6.275000e+00 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 55665.000000 | 2.172520e+05 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 1.614188e+05 | 42.000000 | 2003.000000 | 21.000000 | 4.910000 | 17.950000 |
| 50% | 108569.000000 | 6.122080e+05 | 0.000000 | 0.000000 | 2.000000 | 1.000000 | 5.822240e+05 | 69.950000 | 2007.000000 | 32.000000 | 16.080000 | 33.980000 |
| 75% | 158298.000000 | 9.047510e+05 | 0.000000 | 0.000000 | 5.000000 | 1.000000 | 1.430083e+06 | 116.000000 | 2011.000000 | 50.000000 | 40.240000 | 55.490000 |
| max | 208027.000000 | 3.960788e+06 | 1.000000 | 73.000000 | 99.000000 | 1.000000 | 1.738445e+07 | 12671.480000 | 2018.000000 | 2542.000000 | 14140.210000 | 19138.790000 |
In [89]:
#Analysing the number of missing values in the dataset
missing_values_summary=boxify_dataset.isnull().sum()
missing_values_summary
Out[89]:
Order 0 File_Type 0 SKU_number 0 SoldFlag 122921 SoldCount 122921 MarketingType 0 ReleaseNumber 0 New_Release_Flag 0 StrengthFactor 0 PriceReg 0 ReleaseYear 1 ItemCount 0 LowUserPrice 0 LowNetPrice 0 dtype: int64
In [90]:
#Filling the missing values of soldflag and soldCount with zero
boxify_dataset['SoldCount'].fillna(0, inplace=True)
boxify_dataset['SoldFlag'].fillna(0, inplace=True)
missing_values_summary=boxify_dataset.isnull().sum()
missing_values_summary
Out[90]:
Order 0 File_Type 0 SKU_number 0 SoldFlag 0 SoldCount 0 MarketingType 0 ReleaseNumber 0 New_Release_Flag 0 StrengthFactor 0 PriceReg 0 ReleaseYear 1 ItemCount 0 LowUserPrice 0 LowNetPrice 0 dtype: int64
b.Checking the Column Data types¶
In [5]:
boxify_dataset.dtypes
Out[5]:
Order int64 File_Type object SKU_number int64 SoldFlag float64 SoldCount float64 MarketingType object ReleaseNumber int64 New_Release_Flag int64 StrengthFactor float64 PriceReg float64 ReleaseYear int64 ItemCount int64 LowUserPrice float64 LowNetPrice float64 InventoryTurnover float64 dtype: object
In [5]:
# Changing the data types of ReleaseYear, MarketingType, SoldFlag, SoldCount and FileType
boxify_dataset['MarketingType'] = boxify_dataset['MarketingType'].astype('category')
boxify_dataset['New_Release_Flag'] = boxify_dataset['New_Release_Flag'].astype('category')
boxify_dataset['ReleaseYear'] = pd.to_datetime(boxify_dataset['ReleaseYear'], format='%Y', errors='coerce')
boxify_dataset['SKU_number'] = boxify_dataset['SKU_number'].astype('string')
boxify_dataset.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 198917 entries, 0 to 198916 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Order 198917 non-null int64 1 File_Type 198917 non-null object 2 SKU_number 198917 non-null string 3 SoldFlag 75996 non-null float64 4 SoldCount 75996 non-null float64 5 MarketingType 198917 non-null category 6 ReleaseNumber 198917 non-null int64 7 New_Release_Flag 198917 non-null category 8 StrengthFactor 198917 non-null float64 9 PriceReg 198917 non-null float64 10 ReleaseYear 198916 non-null datetime64[ns] 11 ItemCount 198917 non-null int64 12 LowUserPrice 198917 non-null float64 13 LowNetPrice 198917 non-null float64 dtypes: category(2), datetime64[ns](1), float64(6), int64(3), object(1), string(1) memory usage: 18.6+ MB
3. Exploratory Data Analysis¶
a. Frequency Distribution of File Types¶
In [36]:
fig = px.histogram(boxify_dataset.File_Type, x="File_Type",title="Distribution of File Types")
fig.update_layout(autosize=False,
width=400,
height=400)
fig.show()
b. Frequency Distribution of Marketing Type¶
In [39]:
custom_colors = ['#8481DD', '#F6D173'] # Add more colors if you have more categories
fig = px.pie(
boxify_dataset,
names='MarketingType',
title='Frequency Distribution of Marketing Types',
hole=0.4, # Optional: makes it a donut chart for better readability
color_discrete_sequence=custom_colors
)
fig.update_traces(textinfo='percent')
fig.update_layout(
autosize=False,
width=400,
height=400
)
fig.show()
In [ ]:
c. Sales Trend Over Time¶
In [73]:
# Aggregate sales by ReleaseYear
yearly_sales = valid_years.groupby('ReleaseYear')['SoldCount'].sum().reset_index()
# Plot the time series
fig = px.line(yearly_sales, x='ReleaseYear', y='SoldCount',
title='Yearly Sales Trends',
labels={'ReleaseYear': 'Year', 'SoldCount': 'Total Sold Count'})
fig.update_traces(mode='lines+markers', marker=dict(symbol='circle', size=5))
fig.update_layout(xaxis=dict(range=[1900, 2024]), autosize=False, width=600, height=600)
fig.show()
d. Sales By Marketing Types¶
In [40]:
market_compare = boxify_dataset.groupby(['File_Type','MarketingType'])[['Order']].count()
market_compare
C:\Users\hp\AppData\Local\Temp\ipykernel_16072\2634249690.py:1: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
Out[40]:
| Order | ||
|---|---|---|
| File_Type | MarketingType | |
| Active | D | 62852 |
| S | 60069 | |
| Historical | D | 35119 |
| S | 40877 |
In [56]:
market_com = market_compare.reset_index()
color_map = {
'D': '#7CC674',
'S': '#F4B678'
}
fig = px.bar(
market_com,
x='File_Type',
y='Order',
color='MarketingType',
title='Top Selling Products',
labels={'File_Type': 'File Types', 'Order': 'Order Count'},
barmode='group',
color_discrete_map=color_map
)
fig.update_traces(marker_line_color='black', marker_line_width=1, width=0.35)
fig.update_layout(title_text='The Order Counts for Active and Historical Products', title_x=0.5, autosize=False, width=600, height=600)
fig.show()
C:\Users\hp\anaconda3\Lib\site-packages\plotly\express\_core.py:1958: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
e. Marketing Type counts for SKUs that have and have not been sold in the last 6 months¶
In [76]:
marketing_counts = boxify_dataset.groupby(['SoldFlag', 'MarketingType']).size().reset_index(name='count')
custom_colors = ['#A2D9D9', '#F4B678']
fig0 = px.pie(marketing_counts[marketing_counts['SoldFlag'] == 0],
names='MarketingType', values='count',
title='Marketing Types for SKUs Not Sold in the Last 6 Months', hole = 0.4,
color_discrete_sequence=custom_colors)
# Filter data for SoldFlag = 1
fig1 = px.pie(marketing_counts[marketing_counts['SoldFlag'] == 1],
names='MarketingType', values='count',
title='Marketing Types for SKUs Sold in the Last 6 Months',hole = 0.4,
color_discrete_sequence=custom_colors)
fig0.update_layout(
autosize=False,
width=550,
height=400
)
fig1.update_layout(
autosize=False,
width=550,
height=400
)
fig0.show()
fig1.show()
C:\Users\hp\AppData\Local\Temp\ipykernel_16072\2290916386.py:1: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
f. Top-selling Products¶
In [76]:
top_selling_products = boxify_dataset.groupby('SKU_number')['SoldCount'].sum().reset_index()
# Sorting SoldCount in descending order and considering top 10 products
top_selling_products = top_selling_products.sort_values(by='SoldCount', ascending=False).head(10)
# Plotting the top-selling products
fig = px.bar(top_selling_products, x='SKU_number', y='SoldCount',
title='Top Selling Products',
labels={'SKU_number': 'SKU Number', 'SoldCount': 'Total Sold Count'})
fig.update_traces(marker_color='green', marker_line_color='black', marker_line_width=1.5)
fig.update_layout(title_text='Top 10 Selling Products', title_x=0.5, autosize=False, width=600, height=600)
fig.show()
g. Top Selling Categories¶
In [82]:
top_selling_products = boxify_dataset.groupby('MarketingType')['SoldCount'].sum().reset_index()
# Sorting SoldCount in descending order and considering top 10 products
top_selling_products = top_selling_products.sort_values(by='SoldCount', ascending=False).head(10)
# Plotting the top-selling products
fig = px.bar(top_selling_products, x='MarketingType', y='SoldCount',
title='Top Selling Products',
labels={'MarketingType': 'Marketing Type', 'SoldCount': 'Total Sold Count'})
fig.update_traces(marker_color='orange', marker_line_color='black', marker_line_width=1.5)
fig.update_layout(title_text='Top 10 Selling Products Marketing Type', title_x=0.5, autosize=False, width=600, height=600)
fig.show()
C:\Users\hp\AppData\Local\Temp\ipykernel_2904\2991398427.py:1: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
h. Comparison of Prices for the Hsitorical and Active products¶
In [121]:
price_comparison = boxify_dataset.groupby('File_Type')[['PriceReg', 'LowUserPrice', 'LowNetPrice']].mean().reset_index()
fig = px.bar(price_comparison, x='File_Type', y=['PriceReg', 'LowUserPrice', 'LowNetPrice'],
title='Top Selling Products',
labels={'File_Type': 'File Types'}, barmode='group')
fig.update_traces(marker_line_color='black', marker_line_width=1.5, width=0.25)
fig.update_layout(title_text='The Average Prices for Active and Historical Products', title_x=0.5, autosize=False, width=600, height=600)
fig.show()
i. Calculation of Stock Levels¶
In [128]:
current_stock_items = boxify_dataset.groupby('SKU_number')['ItemCount'].sum().reset_index()
current_stock_items
#To identify the stock_count below defined threshold value
low_stock_threshold = 10
low_stock_items = boxify_dataset[boxify_dataset['ItemCount'] < low_stock_threshold]
low_stocks_count = low_stock_items.shape[0]
low_stocks_count_percentage = (low_stocks_count/boxify_dataset.shape[0])*100
print(f"Low stock item count: {low_stocks_count}")
print(f"Low stock item count percentage: {low_stocks_count_percentage:.2f}%")
Low stock item count: 3379 Low stock item count percentage: 1.70%
In [149]:
#Plotting the number of current stocks
initial_stock = boxify_dataset.groupby('SKU_number')['ItemCount'].sum().reset_index()
# Calculate the total sold count for each SKU_number
total_sold = boxify_dataset.groupby('SKU_number')['SoldCount'].sum().reset_index()
# Merge initial stock and total sold count
stock_data = initial_stock.merge(total_sold, on='SKU_number', how='left')
stock_data['CurrentStock'] = stock_data['ItemCount'] - stock_data['SoldCount']
stock_data = stock_data.sort_values(by='CurrentStock', ascending=False).head(10)
fig = px.bar(stock_data, x='SKU_number', y='CurrentStock', title='Current Stock by SKU Number',
labels={'SKU_number': 'SKU Number', 'CurrentStock': 'Current Stock'})
# Show the plot
fig.update_traces(marker_color='#B323A5', marker_line_color='black', marker_line_width=1.5)
fig.update_layout(title_text='Number of Current Stocks for top-10 SKUs', title_x=0.5, autosize=False, width=600, height=600)
fig.show()
j. Calculating Inventory Turn-over¶
(i). Inventory turnover by SKU¶
In [53]:
boxify_dataset['InventoryTurnover'] = boxify_dataset['SoldCount'] / boxify_dataset['ItemCount']
boxify_dataset.replace([float('inf'), -float('inf')], float('nan'), inplace=True)
boxify_dataset.dropna(subset=['InventoryTurnover'], inplace=True)
boxify_turnOver_TopSKU=boxify_dataset.sort_values(by='InventoryTurnover', ascending=False).head(10)
# Plotting Inventory Turnover using Plotly
fig = px.bar(boxify_turnOver_TopSKU, x='SKU_number', y='InventoryTurnover',
title='Inventory Turnover by SKUs',
labels={'SKU_number': 'SKU Number', 'InventoryTurnover': 'Inventory Turnover Ratio'})
# Update the layout for better readability
fig.update_layout(xaxis={'categoryorder': 'total descending'}, showlegend=False,
title_x=0.5, autosize=False, width=600, height=600)
fig.update_traces(marker_color='#5c9f94', marker_line_color='black', marker_line_width=1.5)
# Show the plot
fig.show()
k. Total Stocks to Total Sales¶
In [23]:
# Calculate total inventory and total sales
total_inventory = boxify_dataset['ItemCount'].sum()
total_sales = boxify_dataset['SoldCount'].sum()
# Calculate stock-to-sales ratio
stock_to_sales_ratio = total_inventory / total_sales
print(f"Stock-to-Sales Ratio: {stock_to_sales_ratio:.2f}")
Stock-to-Sales Ratio: 33.60
In [48]:
boxify_dataset['stock_to_sales_ratio'] = boxify_dataset['ItemCount'].sum() / boxify_dataset['SoldCount'].sum()
stock_to_sales = boxify_dataset.groupby('ReleaseYear')['stock_to_sales_ratio'].sum().reset_index()
stock_to_sales
fig = px.line(stock_to_sales, x='ReleaseYear', y='stock_to_sales_ratio',
title='Stock-to-Sales Ratio by ReleaseYear',
labels={'ReleaseYear': 'Year', 'stock_to_sales_ratio': 'Stock-to-Sales Ratio'})
# Update the layout for better readability
fig.update_layout(xaxis= {'categoryorder': 'total descending'} , showlegend=False,
title_x=0.5, autosize=False, width=600, height=600)
fig.update_traces(mode='lines+markers', marker=dict(symbol='circle', size=5))
# Show the plot
fig.show()
C:\Users\hp\anaconda3\Lib\site-packages\_plotly_utils\basevalidators.py:106: FutureWarning: The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result
In [56]:
boxify_dataset['StockToSalesRatio'] = boxify_dataset['ItemCount'] / boxify_dataset['SoldCount'].replace(0, pd.NA)
# Handle infinite or NaN values
boxify_dataset.replace([float('inf'), -float('inf')], float('nan'), inplace=True)
boxify_dataset.dropna(subset=['StockToSalesRatio'], inplace=True)
boxify_stocksRatio = boxify_dataset.sort_values(by='StockToSalesRatio', ascending=False).head(10)
# Plotting Stock-to-Sales Ratio using Plotly
fig = px.bar(boxify_stocksRatio, x='SKU_number', y='StockToSalesRatio',
title='Stock-to-Sales Ratio by SKU',
labels={'SKU_number': 'SKU Number', 'StockToSalesRatio': 'Stock-to-Sales Ratio'})
# Update the layout for better readability
fig.update_layout(xaxis={'categoryorder': 'total descending'}, showlegend=False, autosize=False, width=600, height=600)
fig.update_traces(marker_color='#c7728c', marker_line_color='black', marker_line_width=1.5)
# Show the plot
fig.show()
l. Reorder Point¶
In [58]:
lead_time_days = 7
safety_stock_percentage = 0.2
# Calculate average daily demand
boxify_dataset['Average_Daily_Demand'] = boxify_dataset['SoldCount'] / 180 # the sales count is for six months
# Calculate demand during lead time
boxify_dataset['Demand_During_Lead_Time'] = boxify_dataset['Average_Daily_Demand'] * lead_time_days
# Calculate safety stock
boxify_dataset['Safety_Stock'] = boxify_dataset['Demand_During_Lead_Time'] * safety_stock_percentage
# Calculate reorder points
boxify_dataset['Reorder_Point'] = boxify_dataset['Demand_During_Lead_Time'] + boxify_dataset['Safety_Stock']
# Get the top 10 products by reorder points
top_10_reorder_points = boxify_dataset.nlargest(10, 'Reorder_Point')
fig = px.bar(top_10_reorder_points, x='SKU_number', y='Reorder_Point',
title='Top 10 Products by Reorder Points',
labels={'SKU_number':'SKU Number', 'Reorder_Point':'Reorder Point'})
fig.update_layout(xaxis={'categoryorder': 'total descending'}, showlegend=False,
title_x=0.5, autosize=False, width=600, height=600)
fig.update_traces(marker_color='#90be6d', marker_line_color='black', marker_line_width=1.5)
fig.show()
m. Top SKUs by Yearly ReorderPoint¶
In [29]:
#boxify_dataset['ReleaseYear'] = boxify_dataset['ReleaseYear'].astype(int)
# Calculate yearly sales for each SKU
yearly_sales = boxify_dataset.groupby(['SKU_number', 'ReleaseYear']).agg({'SoldCount': 'sum'}).reset_index()
# Calculate average yearly sales per SKU
average_sales = yearly_sales.groupby('SKU_number').agg({'SoldCount': 'mean'}).reset_index()
average_sales.rename(columns={'SoldCount': 'AvgYearlySales'}, inplace=True)
# Assume lead time in years (for simplicity, assume 1 year lead time)
lead_time = 1
# Calculate lead time demand
average_sales['LeadTimeDemand'] = average_sales['AvgYearlySales'] * lead_time
# Calculate standard deviation of sales per SKU to estimate variability
sales_variability = yearly_sales.groupby('SKU_number').agg({'SoldCount': 'std'}).reset_index()
sales_variability.rename(columns={'SoldCount': 'SalesStdDev'}, inplace=True)
reorder_data = pd.merge(average_sales, sales_variability, on='SKU_number')
# Assume a safety stock level (e.g., 1.65 for 95% service level)
safety_factor = 1.65
reorder_data['SafetyStock'] = safety_factor * reorder_data['SalesStdDev']
# Calculate reorder points
reorder_data['ReorderPoint'] = reorder_data['LeadTimeDemand'] + reorder_data['SafetyStock']
# Plotting reorder points for top 10 SKUs with highest reorder points
top_10_reorder_points = reorder_data.nlargest(10, 'ReorderPoint')
fig = px.bar(top_10_reorder_points, x='SKU_number', y='ReorderPoint', title='Top 10 SKUs by Yearly Reorder Point',
text=top_10_reorder_points['ReorderPoint'].map(lambda x: f'{x:.2f}'))
fig.update_layout(xaxis={'categoryorder': 'total descending'}, showlegend=False,
title_x=0.5, autosize=False, width=600, height=600)
fig.update_traces(marker_color='#ff6361', marker_line_color='black', marker_line_width=1.5, textposition='outside' )
fig.show()
top_10_reorder_points = top_10_reorder_points.merge(yearly_sales[['SKU_number', 'ReleaseYear']], on='SKU_number', how='left')
top_10_reorder_points
Out[29]:
| SKU_number | AvgYearlySales | LeadTimeDemand | SalesStdDev | SafetyStock | ReorderPoint | ReleaseYear | |
|---|---|---|---|---|---|---|---|
| 0 | 2271238 | 1.5 | 1.5 | 2.121320 | 3.500179 | 5.000179 | 1900-01-01 |
| 1 | 2271238 | 1.5 | 1.5 | 2.121320 | 3.500179 | 5.000179 | 2012-01-01 |
| 2 | 3545620 | 1.5 | 1.5 | 2.121320 | 3.500179 | 5.000179 | 1900-01-01 |
| 3 | 3545620 | 1.5 | 1.5 | 2.121320 | 3.500179 | 5.000179 | 2000-01-01 |
| 4 | 2264246 | 1.0 | 1.0 | 1.414214 | 2.333452 | 3.333452 | 1900-01-01 |
| 5 | 2264246 | 1.0 | 1.0 | 1.414214 | 2.333452 | 3.333452 | 2011-01-01 |
| 6 | 3604079 | 1.0 | 1.0 | 1.414214 | 2.333452 | 3.333452 | 1900-01-01 |
| 7 | 3604079 | 1.0 | 1.0 | 1.414214 | 2.333452 | 3.333452 | 2001-01-01 |
| 8 | 1781265 | 0.5 | 0.5 | 0.707107 | 1.166726 | 1.666726 | 2015-01-01 |
| 9 | 1781265 | 0.5 | 0.5 | 0.707107 | 1.166726 | 1.666726 | 2016-01-01 |
| 10 | 2263930 | 0.5 | 0.5 | 0.707107 | 1.166726 | 1.666726 | 1900-01-01 |
| 11 | 2263930 | 0.5 | 0.5 | 0.707107 | 1.166726 | 1.666726 | 2011-01-01 |
| 12 | 2264421 | 0.5 | 0.5 | 0.707107 | 1.166726 | 1.666726 | 1900-01-01 |
| 13 | 2264421 | 0.5 | 0.5 | 0.707107 | 1.166726 | 1.666726 | 2011-01-01 |
| 14 | 2265599 | 0.5 | 0.5 | 0.707107 | 1.166726 | 1.666726 | 1900-01-01 |
| 15 | 2265599 | 0.5 | 0.5 | 0.707107 | 1.166726 | 1.666726 | 2011-01-01 |
| 16 | 2268109 | 0.5 | 0.5 | 0.707107 | 1.166726 | 1.666726 | 1900-01-01 |
| 17 | 2268109 | 0.5 | 0.5 | 0.707107 | 1.166726 | 1.666726 | 2010-01-01 |
| 18 | 270587 | 0.5 | 0.5 | 0.707107 | 1.166726 | 1.666726 | 2011-01-01 |
| 19 | 270587 | 0.5 | 0.5 | 0.707107 | 1.166726 | 1.666726 | 2012-01-01 |
In [92]:
fig = px.histogram(boxify_dataset,
x="New_Release_Flag",
color="New_Release_Flag",
facet_col="SoldFlag",
title="SoldFlag Counts for SKUs with and without Plans for Future Release",
labels={"New_Release_Flag": "New Release Flag", "count": "Count"},
category_orders={"SoldFlag": [0, 1]},
color_discrete_sequence=["#009596", "#C9190B"])
# Update layout for better visualization
fig.update_layout(
barmode='group',
height=600,
width=1000)
C:\Users\hp\anaconda3\Lib\site-packages\plotly\express\_core.py:1958: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
o. Relation between Marketing Type and Price by Sales Performance¶
In [97]:
color_map = {
0: '#9b19f5',
1: '#ffa300'
}
# Plotting PriceReg vs MarketingType
fig_price_reg = px.box(boxify_dataset,
x="MarketingType",
y="PriceReg",
color="SoldFlag",
title="PriceReg vs MarketingType",
labels={"MarketingType": "Marketing Type", "PriceReg": "PriceReg"},
category_orders={"SoldFlag": [0, 1]},
color_discrete_map=color_map)
fig_price_reg.update_yaxes(range=[0, 750])
fig_price_reg.update_layout(title_x=0.5, autosize=False, width=600, height=600)
fig_price_reg.show()
# Plotting LowUserPrice vs MarketingType
fig_low_user_price = px.box(boxify_dataset,
x="MarketingType",
y="LowUserPrice",
color="SoldFlag",
title="LowUserPrice vs MarketingType",
labels={"MarketingType": "Marketing Type", "LowUserPrice": "LowUserPrice"},
category_orders={"SoldFlag": [0, 1]},
color_discrete_map=color_map)
fig_low_user_price.update_yaxes(range=[0, 750])
fig_low_user_price.update_layout(title_x=0.5, autosize=False, width=600, height=600)
fig_low_user_price.show()
# Plotting LowNetPrice vs MarketingType
fig_low_net_price = px.box(boxify_dataset,
x="MarketingType",
y="LowNetPrice",
color="SoldFlag",
title="LowNetPrice vs MarketingType",
labels={"MarketingType": "Marketing Type", "LowNetPrice": "LowNetPrice"},
category_orders={"SoldFlag": [0, 1]},
color_discrete_map=color_map)
fig_low_net_price.update_yaxes(range=[0, 750])
fig_low_net_price.update_layout(title_x=0.5, autosize=False, width=600, height=600)
fig_low_net_price.show()
p. Relationship among each variables¶
In [130]:
# Select only numeric columns from the DataFrame
numeric_data = boxify_dataset.select_dtypes(include=['float64', 'int64'])
# Calculate the correlation matrix
corr_matrix = numeric_data.corr()
# Create a Plotly heatmap
fig = go.Figure(data=go.Heatmap(
z=corr_matrix.values,
x=corr_matrix.columns,
y=corr_matrix.columns,
colorscale='PiYG',
zmin=-1, zmax=1,
text=corr_matrix.values,
texttemplate="%{text:.2f}",
textfont={"size":10},
))
# Update layout
fig.update_layout(
title='Correlation Heatmap',
xaxis_nticks=36,
yaxis_nticks=36,
title_x=0.5, autosize=False, width=700, height=700
)
fig.show()
q. Item Count Distribution and Sales Performance¶
In [157]:
item_count_sales = boxify_dataset.groupby('ItemCount')['SoldCount'].sum().reset_index()
item_count_avg_sales = boxify_dataset.groupby('ItemCount')['SoldCount'].mean().reset_index()
# Merge the two dataframes
item_count_summary = pd.merge(item_count_sales, item_count_avg_sales, on='ItemCount', suffixes=('_Total', '_Average'))
# Plot the distribution and average of ItemCount in relation to sales performance
fig = px.scatter(
item_count_summary,
x='ItemCount',
y=['SoldCount_Total'],
title='Item Count Distribution and Sales Performance',
labels={'ItemCount': 'Item Count', 'SoldCount': 'Sales Count'},
opacity=0.9
)
fig.update_layout(title_x=0.5, autosize=False, width=700, height=700)
fig.update_traces(marker=dict(size=5))
fig.show()